rm(list=ls())
#source("/data_genome1/SharedSoftware/inhouse_development/R/FunctionalEnrichmentHelpers.R")
library(limma)
library(xlsx)
library(reshape2)
library(pheatmap)
library(GSA)
library(xtable)
library(gplots)
library(impute)
library(gplots)
library(RColorBrewer)
library(scatterplot3d)
library(naturalsort)
library(ggplot2)
library(data.table)
library(GenomicRanges)
library(pander)
#library(VennDiagram)
#flog.threshold(ERROR) # to stop VennDiagram writing a log file for each plot
library("rrcov")
library("rms")
library(org.Hs.eg.db)
library(DESeq2)
library(survival)
library(knitr)
source("../Human/Helper_functions.R")
library(BiocParallel)
result_folder = "../../Results/TCGA/"
if (!file.exists(result_folder)) dir.create(result_folder, recursive=T)
load("../../Results/Human/DGE_analysis_image.Rdata")
name_trans = c("early_vs_whole_unpaired"="ecto_early_vs_whole_unpaired", "ground_state_vs_whole_all"="ecto_2D_vs_whole_all", "ecto_vs_endo"="ecto_vs_endo_whole")
names(all_results) = ifelse(names(all_results) %in% names(name_trans), name_trans[names(all_results)], names(all_results))
load("../../Results/Human/MB194_Human_gene_lists_and_signatures.Rdata")
load("../../Data/External/CervicalCancer_2017_Data/RNASeq/TCGA_Cervix_RNASeq_v2_counts_2019-04-10.Rdata")
colnames(ed_TCGA) = gsub("SampleID", "sample_barcode", colnames(ed_TCGA))
ed_TCGA_complete = ed_TCGA
ed_TCGA = ed_TCGA[!duplicated(ed_TCGA$sample_barcode),]
ed_TCGA$patient_barcode = unlist(sapply(strsplit(ed_TCGA$sample_barcode, "-"), function(x) paste(x[1:3],collapse="-")))
ed_TCGA$iCluster_k3_named = with(ed_TCGA, ifelse(SAMP.iCluster_All_k3=="C3","Adenocarcinoma", ifelse(SAMP.iCluster_All_k3=="C2","Keratin-high","Keratin-low")))
size_factors = estimateSizeFactorsForMatrix(expr_mat_TCGA)
expr_mat_TCGA_norm = sweep(expr_mat_TCGA, 2, size_factors, "/")
ed$ShortName_unique = paste(ed$ShortName, ed$patient_or_mouse_id, ed$Passage, sep="_")
ed_MB194_shortname = ed
rownames(ed_MB194_shortname) = ed_MB194_shortname$ShortName_unique
Data has been obtained from TCGA data companion site for the TCGA 2017 publication on CESC (https://tcga-data.nci.nih.gov/docs/publications/cesc_2016/). Level 3 processed RNASeq_v2 data was downloaded and per gene expression levels were extracted from "*.rsem.genes.normalized_results" files using custom scripts. Sample annotations for those samples was also obtained from the companion web site.
The TCGA Cervical cancer project includes both squamous carcinoma and adenocarcinoma of the cervix. We use normalised per-gene FPKM values as generated by TCGA. The subtypes of cervical adenocarcinoma are merged to a single class for simplicity.
sel_samples = subset(ed_TCGA, !duplicated(sample_barcode) & !is.na(CLIN.Dx_merged))$sample_barcode
ed_TCGA_orig = ed_TCGA
ed_TCGA = ed_TCGA[sel_samples,]
In total we included 178 samples. Histological diagnosis was distributed as follows.
table(ed_TCGA[sel_samples, ]$CLIN.Dx_merged, useNA="ifany")
##
## Adenocarcinoma Adenosquamous Squamous
## 31 3 144
ed_TCGA$surv_status = ifelse(ed_TCGA$CLIN.vital_status=="Dead",1,0)
ed_TCGA$surv_time = as.numeric(ifelse(ed_TCGA$CLIN.vital_status=="Dead", ed_TCGA$CLIN.days_to_death, ed_TCGA$CLIN.days_to_last_known_alive))
ed_TCGA$histology_class = factor(ifelse(ed_TCGA$CLIN.Dx_merged=="Squamous", "Cervical Squamous Cell Carcinoma", "Cervical Adenocarcinoma"))
ed_TCGA$"Histological type" = ed_TCGA$histology_class
ed_tcga_sample_bc = ed_TCGA[sel_samples,]
rownames(ed_tcga_sample_bc) = ed_tcga_sample_bc$sample_barcode
We here check and classify samples according to their KRT5 and KRT7, KRT8 expression.
sel_genes = symbol_to_entrez(c("KRT5","KRT7", "KRT8"))
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(sel_genes) = sel_genes$EntrezID
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
rownames(norm_mat_sig_genes) = sel_genes[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
norm_mat_sig_genes_sorted2 = norm_mat_sig_genes[,order(colnames(norm_mat_sig_genes))]
ed_sorted = ed_tcga_sample_bc[colnames(norm_mat_sig_genes_sorted2), ]
col_anno_df = ed_sorted[,c("Histological type","CLIN.Dx_merged")]
rownames(col_anno_df) = ed_sorted$sample_barcode
tmp = norm_mat_sig_genes_sorted2
#tmp2 = tmp-matrix(rep(apply(tmp,2,mean), each=nrow(tmp)), nrow=nrow(tmp), ncol=ncol(tmp))
plot(as.numeric(tmp["3852", order(tmp["3852",])]), col="red", pch=20, xlab="Sample index", ylab="Expression minus mean KRT5+KRT7+KRT8 log2 expression level", ylim=c(min(tmp), max(tmp)))
points(as.numeric(tmp["3855", order(tmp["3852",])]), col="blue", pch=20)
points(as.numeric(tmp["3856", order(tmp["3852",])]), col="orange", pch=20)
legend(x=30,y=7, legend=c("KRT5","KRT7","KRT8"), fill=c("red","blue","orange"))
plot(as.matrix(tmp)["3852",], as.matrix(tmp)["3855",], xlab="KRT5 expression level", ylab="KRT7 expression level", col=as.numeric(col_anno_df[colnames(tmp),"Histological type"]), pch=20, main="KRT5 vs. KRT7")
legend("bottomleft",legend=levels(col_anno_df[colnames(tmp),"Histological type"]), fill=as.numeric(factor(levels(col_anno_df[colnames(tmp),"Histological type"]))))
plot(as.matrix(tmp)["3852",], as.matrix(tmp)["3856",], xlab="KRT5 expression level", ylab="KRT8 expression level", col=as.numeric(col_anno_df[colnames(tmp),"Histological type"]), pch=20, main="KRT5 vs. KRT8")
legend("bottomleft",legend=levels(col_anno_df[colnames(tmp),"Histological type"]), fill=as.numeric(factor(levels(col_anno_df[colnames(tmp),"Histological type"]))))
plot(as.matrix(tmp)["3855",], as.matrix(tmp)["3856",], xlab="KRT7 expression level", ylab="KRT8 expression level", col=as.numeric(col_anno_df[colnames(tmp),"Histological type"]), pch=20, main="KRT7 vs. KRT8")
legend("bottomleft",legend=levels(col_anno_df[colnames(tmp),"Histological type"]), fill=as.numeric(factor(levels(col_anno_df[colnames(tmp),"Histological type"]))))
par(mfrow=c(1,3))
hist(as.matrix(norm_mat_sig_genes_sorted2)["3852",], 40, xlab="KRT5 expression", main="")
abline(v=16, col="red", lwd=2)
hist(as.matrix(norm_mat_sig_genes_sorted2)["3855",], 40, xlab="KRT7 expression", main="")
abline(v=12, col="red", lwd=2)
hist(as.matrix(norm_mat_sig_genes_sorted2)["3856",], 40, xlab="KRT8 expression", main="")
abline(v=13, col="red", lwd=2)
par(mfrow=c(1,1))
We observe a globally inverse relationship between KRT5 and KRT7 (r = -0.3539802), with a relevant portion shwoing medium to high expression for both KRT5 and KRT7.
We set the cutoffs for KRT5 to 16, for KRT7 to 12 and KRT8 to 13.
Here we explore the correlation between KRT5/7 expression and histological diagnosis.
sel_genes = symbol_to_entrez(c("KRT5","KRT7"))
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(sel_genes) = sel_genes$EntrezID
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
rownames(norm_mat_sig_genes) = sel_genes[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
norm_mat_sig_genes_sorted2 = norm_mat_sig_genes[,order(colnames(norm_mat_sig_genes))]
ed_sorted = ed_tcga_sample_bc[colnames(norm_mat_sig_genes_sorted2), ]
col_anno_df = ed_sorted[,c("Histological type","CLIN.Dx_merged")]
rownames(col_anno_df) = ed_sorted$sample_barcode
anno_colors = list()
anno_colors[["Up/Down regulation"]] = c("Higher in Ecto"="Red","Higher in Endo"="Blue")
anno_colors[["Histological type"]] = c("Cervical Squamous Cell Carcinoma"="Orange", "Cervical Adenocarcinoma"="Green")
nmat_scaled = t(scale(t(norm_mat_sig_genes_sorted2)))
nmat_scaled = nmat_scaled[apply(is.na(nmat_scaled), 1, sum)==0, ]
nmat_scaled[abs(nmat_scaled) > 3] <- sign(nmat_scaled[abs(nmat_scaled) > 3]) * 3
breaks_new = c(-5, seq(-1,1,2/98), 5)
pheatmap(nmat_scaled, cluster_rows = F, cluster_cols = T, show_rownames = T, scale="none", main="KRT5/7, TCGA Cervical cancer samples", breaks=breaks_new, annotation_colors = anno_colors, labels_row = sel_genes[rownames(nmat_scaled),"GeneSymbol"], annotation_col = col_anno_df, show_colnames = F, clustering_method = "average")
pheatmap(norm_mat_sig_genes_sorted2, cluster_rows = F, cluster_cols = T, show_rownames = T, scale="none", main="KRT5/7, TCGA Cervical cancer samples, absolute counts", annotation_colors = anno_colors, labels_row = sel_genes[rownames(nmat_scaled),"GeneSymbol"], annotation_col = col_anno_df, show_colnames = F)
tmp = norm_mat_sig_genes_sorted2
tmp[1,] = ifelse(tmp[1,]< 16,0,1)
tmp[2,] = ifelse(tmp[2,]< 12,0,1)
pheatmap(tmp, cluster_rows = F, cluster_cols = T, show_rownames = T, scale="none", main="KRT5/7, TCGA Cervical cancer samples, using fixed thresholds", annotation_colors = anno_colors, labels_row = sel_genes[rownames(nmat_scaled),"GeneSymbol"], annotation_col = col_anno_df, show_colnames = F)
Here we explore the correlation between KRT5/7 expression and histological diagnosis.
sel_genes = symbol_to_entrez(c("KRT5","KRT7", "KRT8"))
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(sel_genes) = sel_genes$EntrezID
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
rownames(norm_mat_sig_genes) = sel_genes[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
norm_mat_sig_genes_sorted2 = norm_mat_sig_genes[,order(colnames(norm_mat_sig_genes))]
ed_sorted = ed_tcga_sample_bc[colnames(norm_mat_sig_genes_sorted2), ]
anno_colors = list()
anno_colors[["Up/Down regulation"]] = c("Higher in Ecto"="Red","Higher in Endo"="Blue")
anno_colors[["Histological type"]] = c("Cervical Squamous Cell Carcinoma"="Orange", "Cervical Adenocarcinoma"="Green")
anno_colors[["Keratinizing"]] = c("Keratinizing squamous cell carcinoma"="red", "Non-keratinizing squamous cell carcinoma" = "lightgreen", "non-squamous cell carcinoma"="darkgreen", "n.a."="grey")
nmat_scaled = t(scale(t(norm_mat_sig_genes_sorted2)))
nmat_scaled = nmat_scaled[apply(is.na(nmat_scaled), 1, sum)==0, ]
nmat_scaled[abs(nmat_scaled) > 3] <- sign(nmat_scaled[abs(nmat_scaled) > 3]) * 3
breaks_new = c(-5, seq(-1,1,2/98), 5)
pheatmap(nmat_scaled, cluster_rows = F, cluster_cols = T, show_rownames = T, scale="none", main="KRT5/7, TCGA Cervical cancer samples", breaks=breaks_new, annotation_colors = anno_colors, labels_row = sel_genes[rownames(nmat_scaled),"GeneSymbol"], annotation_col = col_anno_df, show_colnames = F, clustering_method = "average")
pheatmap(norm_mat_sig_genes_sorted2, cluster_rows = F, cluster_cols = T, show_rownames = T, scale="none", main="KRT5/7, TCGA Cervical cancer samples, absolute counts", annotation_colors = anno_colors, labels_row = sel_genes[rownames(nmat_scaled),"GeneSymbol"], annotation_col = col_anno_df, show_colnames = F)
tmp = norm_mat_sig_genes_sorted2
tmp["3852",] = ifelse(tmp["3852",]< 16,0,1)
tmp["3855",] = ifelse(tmp["3855",]< 12,0,1)
tmp["3856",] = ifelse(tmp["3856",]< 13,0,1)
pheatmap(tmp, cluster_rows = F, cluster_cols = T, show_rownames = T, scale="none", main="KRT5/7, TCGA Cervical cancer samples, using fixed thresholds", annotation_colors = anno_colors, labels_row = sel_genes[rownames(nmat_scaled),"GeneSymbol"], annotation_col = col_anno_df, show_colnames = F)
sel_genes = symbol_to_entrez(c("KRT5","KRT7","KRT8"))
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(sel_genes) = sel_genes$EntrezID
krt_expr = as.data.frame(t(log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples ]+1e-1)))
colnames(krt_expr) = sel_genes[colnames(krt_expr),"GeneSymbol"]
krt_expr$KRT5_class = ifelse(krt_expr[,"KRT5"]<16,"low","high")
krt_expr$KRT7_class = ifelse(krt_expr[,"KRT7"]<12,"low","high")
krt_expr$KRT8_class = ifelse(krt_expr[,"KRT8"]<13,"low","high")
rownames(krt_expr) = ed_TCGA[rownames(krt_expr), "sample_barcode"]
ed_TCGA = merge(ed_TCGA, krt_expr, by.x="sample_barcode", by.y=0, all.x=T, sort=F)
rownames(ed_TCGA) = ed_TCGA$Derived.Data.File
ed_TCGA$KRT_class = ifelse(ed_TCGA$KRT5_class=="high" & ed_TCGA$KRT7_class=="high","KRT5+/7+", ifelse(ed_TCGA$KRT5_class=="low" & ed_TCGA$KRT7_class=="low", "KRT5-/7-", ifelse(ed_TCGA$KRT5_class=="high","KRT5+/7-","KRT5-/7+")))
In general there seems to be a good correlation between \(KRT5_{low}\)/\(KRT7_{high}\) and the diagnosis of Cervical Adenocarcinoma, while \(KRT5_{high}\) cases seem to be squamous irrespective of KRT7 status.
We here check and classify samples according to their TP63 expression.
sel_genes = symbol_to_entrez(c("KRT5","KRT7","TP63"))
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(sel_genes) = sel_genes$EntrezID
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
rownames(norm_mat_sig_genes) = sel_genes[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
#colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
norm_mat_sig_genes_sorted2 = norm_mat_sig_genes[,order(colnames(norm_mat_sig_genes))]
ed_sorted = ed_tcga_sample_bc[colnames(norm_mat_sig_genes_sorted2), ]
col_anno_df = ed_sorted[,c("Histological type","CLIN.Dx_merged")]
rownames(col_anno_df) = ed_sorted$sample_barcode
tmp = norm_mat_sig_genes_sorted2
plot(as.matrix(tmp)["3852",], as.matrix(tmp)["8626",], xlab="KRT5 expression level", ylab="TP63 expression level", col=as.numeric(col_anno_df[colnames(tmp),"Histological type"]), pch=20)
legend("topleft",legend=levels(col_anno_df[colnames(tmp),"Histological type"]), fill=as.numeric(factor(levels(col_anno_df[colnames(tmp),"Histological type"]))))
# library(mixtools)
# nm = normalmixEM(as.matrix(norm_mat_sig_genes_sorted2)["8626",], k=2)
# plot(nm, which=2)
par(mfrow=c(1,1))
hist(as.matrix(norm_mat_sig_genes_sorted2)["8626",], 40, xlab="TP63 expression", main="")
abline(v=10, col="red", lwd=2)
par(mfrow=c(1,1))
TP63_expr = as.data.frame(t(log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% c("8626"), sel_samples, drop=F ]+1e-1)))
colnames(TP63_expr) = sel_genes[colnames(TP63_expr),"GeneSymbol"]
TP63_expr$TP63_class = ifelse(TP63_expr[,"TP63"]<10,"low","high")
ed_TCGA = merge(ed_TCGA, TP63_expr, by.x="sample_barcode", by.y=0, all.x=T, sort=F)
rownames(ed_TCGA) = ed_TCGA$sample_barcode
image_file = paste(result_folder, "Supp Fig8a TCGA KRT5 KRT7 KRT8 TP63 thresholds.png" ,sep="/")
png(image_file, width=2000, height=1000)
par(mfrow=c(1,4), cex=3)
hist(ed_TCGA$KRT5, 40, xlab="KRT5 expression", main="")
abline(v=16, col="red", lwd=3)
hist(ed_TCGA$KRT7, 40, xlab="KRT7 expression", main="")
abline(v=12, col="red", lwd=3)
# hist(ed_TCGA$KRT8, 40, xlab="KRT8 expression", main="")
# abline(v=13, col="red", lwd=3)
hist(ed_TCGA$TP63, 40, xlab="TP63 expression", main="")
abline(v=10, col="red", lwd=3)
par(mfrow=c(1,1))
dev.off()
## png
## 2
Here we correlate (Spearman) the z-scored expression values from the TCGA dataset for each sample with the vector of fold-changes from MB194 Ecto vs. Endo for the same genes. Positive correlations correspond to agreement with the Ecto signature while negative ones agree better with the Endo signature.
We define the Endo class as samples with score < -0.2, Ecto as > 0.2 and everything between as undertermined.
# Ecto Endo Classifier
sel_genes = dedup_DGE_results(subset(all_gene_signatures[["MB194_Ecto_vs_Endo_Diff"]], !is.na(EntrezID)))
rownames(sel_genes) = sel_genes$EntrezID
e2s = unique(sel_genes[,c("EntrezID","GeneSymbol")])
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
rownames(norm_mat_sig_genes) = e2s[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
nmat_scaled = t(scale(t(norm_mat_sig_genes)))
nmat_scaled = nmat_scaled[apply(is.na(nmat_scaled), 1, sum)==0, ]
# Simple counting aggreement classifier
#zz = apply(nmat_scaled, 2, function(x) { sum(sign(x)*sign(row_anno[names(x),"logFC"])) } )/nrow(nmat_scaled)
# Correlation of logFC from Ecto/Endo with z-scored expression values
endoecto_cor_with_zscore = apply(nmat_scaled, 2, function(x) { cor(x, sel_genes[names(x),"logFC"], method="spearman", use="pairwise") } )
ed_TCGA$ecto_endo_score = endoecto_cor_with_zscore[ed_TCGA$sample_barcode]
plot(density(ed_TCGA$ecto_endo_score, na.rm=T), xlab="Ecto vs. Endo Correlation Score", main="Distribution of Ecto vs. Endo correlation score", sub="(Red lines: thresholds for classifier)" )
abline(v=c(-0.2,0.2), col="red", lty=2)
#library(mixtools)
#nm = normalmixEM(ed_TCGA$ecto_endo_score[!is.na(ed_TCGA$ecto_endo_score)], k=3)
#plot(nm, which=2)
# cutoffs selected visually from estimated mix model with 3 classes
ed_TCGA$EctoEndoClass = ifelse(ed_TCGA$ecto_endo_score < -0.2, "Endo", ifelse(ed_TCGA$ecto_endo_score > 0.2, "Ecto", "undetermined"))
We here draw randomly 1000 gene sets of same size as the Ecto/Endo differentially expressed genes from MB194, assign the same fold changes as in the Ecto/Endo signature randomly to the genes and compute the correlation between fold changes and gene expression Z-scores for each sample. We then compare the distribution of the per-sample correlations obtained for the Ecto-Endo Diff genes with the distribution of per-sample correlations for the 1000 random gene sets.
bpparam = MulticoreParam(worker=25, manager.hostname="127.0.0.1")
rerun_random_sampling = TRUE
if(rerun_random_sampling) {
all_random_samples = list()
expr_values = log2(expr_mat_TCGA_norm[, sel_samples]+1e-1)
nmat_scaled = t(scale(t(expr_values)))
nmat_scaled = nmat_scaled[apply(is.na(nmat_scaled), 1, sum)==0, ]
sim_fun <- function(x) {
sel_genes_random = sample.int(nrow(nmat_scaled), nrow(sel_genes))
curr_endoecto_cor_with_zscore = apply(nmat_scaled[sel_genes_random,], 2, function(x) { cor(x, sel_genes[,"logFC"], method="spearman", use="pairwise") } )
return(curr_endoecto_cor_with_zscore)
}
all_random_samples = bplapply(1:1000, sim_fun, BPPARAM = bpparam)
all_random_samples_mat = as.matrix(do.call(cbind, all_random_samples))
save(all_random_samples_mat,file=file.path(result_folder, "TCGA_ecto_endo_classifier_random_sampling.Rdata"))
} else {
load(file.path(result_folder, "TCGA_ecto_endo_classifier_random_sampling.Rdata"))
}
hist(all_random_samples_mat,100)
plot(density(ed_TCGA$ecto_endo_score, na.rm=T), col="red", ylim=c(0,2))
lines(density(all_random_samples_mat), col="black")
legend("topright",legend=c("Spearman R of 1000 random gene sets","Spearman R for EctoEndo Diff genes"), fill=c("black","red"))
abline(v=c(-0.2,0.2), lty=2)
image_file = paste(result_folder, "Supp Fig8b TCGA EctoEndoScore.png" ,sep="/")
png(image_file, width=1200, height=1000)
par(mfrow=c(1,1), cex=3)
plot(density(ed_TCGA$ecto_endo_score, na.rm=T), xlab="Squamous vs. Columnar Correlation Score", main="Distribution of Squamous vs. Columnar correlation score", sub="(Red lines: thresholds for classifier)" )
abline(v=c(-0.2,0.2), col="red", lty=2, lwd=3, ylim=c(0,2))
lines(density(all_random_samples_mat), col="green")
legend("topleft",legend=c("Spearman R of 1000 random gene sets","Spearman R for EctoEndo Diff genes"), fill=c("green","red"), cex=0.5)
par(mfrow=c(1,1))
dev.off()
## png
## 2
par(mar=c(15,4,4,2))
boxplot(ed_TCGA$ecto_endo_score ~ ed_TCGA$KRT_class, ylab="Ecto Score (>0: rather Ecto; <0: rather Endo)", las=2)
boxplot(ed_TCGA$ecto_endo_score ~ ed_TCGA$histology_class, ylab="Ecto Score (>0: rather Ecto; <0: rather Endo)", las=2)
table(ed_TCGA$EctoEndoClass, ed_TCGA$histology_class)
##
## Cervical Adenocarcinoma Cervical Squamous Cell Carcinoma
## Ecto 0 68
## Endo 32 16
## undetermined 2 60
ed_tcga_sample_bc = ed_TCGA #unique(ed_TCGA[,c("sample_barcode","Histological type","Keratinizing", "tumor_necrosis_percent", "tumor_nuclei_percent")])
rownames(ed_tcga_sample_bc) = ed_tcga_sample_bc$sample_barcode
anno_colors = list()
anno_colors[["Up/Down regulation"]] = c("Higher in Ecto"="Red","Higher in Endo"="Blue")
anno_colors[["Histological type"]] = c("Cervical Squamous Cell Carcinoma"="Orange", "Cervical Adenocarcinoma"="Green", "NA"="grey")
anno_colors[["iCluster_k3_named"]] = c("Adenocarcinoma"="blue", "Keratin-high"="red", "Keratin-low"="yellow")
anno_colors[["KRT5"]] = c("high"="red", "low"="blue")
anno_colors[["KRT7"]] = c("high"="red", "low"="blue")
anno_colors[["KRT8"]] = c("high"="red", "low"="blue")
anno_colors[["EctoEndoScore"]]=colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100)
#anno_colors[["EctoEndoClassifier"]]=c("Ecto"=rgb(0,213,250, maxColorValue = 255),"Endo"=rgb(255,134,255, maxColorValue = 255), "undetermined"=rgb(249,168,15, maxColorValue = 255))
#anno_colors[["EctoEndoClassifier"]]=c("Ecto"="red","Endo"="blue", "undetermined"="grey")
tmp_cols = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100)
tmp_score_range = range(sort(ed_TCGA$ecto_endo_score))
tmp_group = factor(ed_TCGA$EctoEndoClass, levels=c("Endo","undetermined","Ecto"))
tmp_group_median = tapply(ed_TCGA$ecto_endo_score, tmp_group, median)
#final_classifier_cols = tmp_cols[(cumsum(table(tmp_group))-round(table(tmp_group)/2))]
final_classifier_cols = tmp_cols[round((tmp_group_median-tmp_score_range[1])/diff(tmp_score_range) * 100)]
names(final_classifier_cols) = levels(tmp_group)
anno_colors[["EctoEndoClassifier"]] =final_classifier_cols
anno_colors[["TP63"]] = c("high"="red", "low"="blue")
anno_colors[["CLIN.UCEC.like"]] = c("Yes"="red","No"="blue")
anno_colors_default = anno_colors
Here we show the expression of all genes from TCGA cervix cancer data belonging to the Ecto vs. Endo significantly regulated genes in human ecto vs. endocervix differentiated organoids.
### Genes ordered by log2FC from Ecto vs. Endo in MB194, samples clustered by similarity
sel_genes = dedup_DGE_results(subset(all_gene_signatures[["MB194_Ecto_vs_Endo_Diff"]], !is.na(EntrezID)))
rownames(sel_genes) = sel_genes$EntrezID
e2s = unique(sel_genes[,c("EntrezID","GeneSymbol")])
rownames(e2s) = as.character(e2s$EntrezID)
sel_genes_ecto_endo_in_TCGA = subset(sel_genes, as.character(EntrezID) %in% rownames(expr_mat_TCGA_norm))
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
rownames(norm_mat_sig_genes) = e2s[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
norm_mat_sig_genes_sorted2 = norm_mat_sig_genes[order(sel_genes[rownames(norm_mat_sig_genes),"logFC"], decreasing=T),order(colnames(norm_mat_sig_genes))]
row_anno = sel_genes[rownames(norm_mat_sig_genes_sorted2), c("logFC"), drop=F]
row_anno_cat = row_anno
row_anno_cat$"Up/Down regulation" = ifelse(row_anno_cat$logFC > 0, "Higher in Ecto", "Higher in Endo")
row_anno_cat[["logFC"]] = NULL
g = all_gene_signatures[["MB194_SC_Vs_Diff_Ecto"]][rownames(norm_mat_sig_genes_sorted2),"group"]
#ed_tcga_sample_bc = unique(ed_TCGA[,c("sample_barcode","histological_type","keratinization_squamous_cell")])
ed_sorted = ed_tcga_sample_bc[colnames(norm_mat_sig_genes_sorted2), ]
col_anno_df = ed_sorted[,c("Histological type","CLIN.Dx_merged", "KRT5_class", "KRT7_class","SAMP.Purity_Absolute","TP63_class","EctoEndoClass","ecto_endo_score", "iCluster_k3_named")]
rownames(col_anno_df) = ed_sorted$sample_barcode
colnames(col_anno_df) = gsub("EctoEndoClass","EctoEndoClassifier",gsub("ecto_endo_score","EctoEndoScore",gsub("_class$","",colnames(col_anno_df))))
anno_colors = anno_colors_default
nmat_scaled = t(scale(t(norm_mat_sig_genes_sorted2)))
nmat_scaled = nmat_scaled[apply(is.na(nmat_scaled), 1, sum)==0, ]
nmat_scaled[abs(nmat_scaled) > 3] <- sign(nmat_scaled[abs(nmat_scaled) > 3]) * 3
breaks_new = c(-2, seq(-1,1,2/98), 2)
# pheatmap(nmat_scaled, cluster_rows = F, cluster_cols = T, show_rownames = F, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", breaks=breaks_new, annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F)
breaks_new = c(-2, seq(-1,1,2/98), 2)
pheatmap(nmat_scaled[, order(ed_tcga_sample_bc[colnames(nmat_scaled), "ecto_endo_score"])], cluster_rows = F, cluster_cols = F, show_rownames = F, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", breaks=breaks_new, annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F)
par_orig = par()
#image_file = paste(result_folder, "Supp Fig5 TCGA Ecto Endo Signature.tiff" ,sep="/")
#tiff(image_file, width=23*100, height=15 * 100, antialias=F)
image_file = paste(result_folder, "Fig 7a TCGA Ecto Endo Signature.png" ,sep="/")
#png(image_file, width=2300, height=2500)
#par(mar=c(14,10,4,2))
col_anno_df3 = col_anno_df
col_anno_df3[["SAMP.Purity_Absolute"]] = NULL
col_anno_df3[["CLIN.Dx_merged"]] = NULL
colnames(col_anno_df3) = gsub("iCluster_k3_named", "TCGA_iCluster_k3", colnames(col_anno_df3))
anno_colors[["TCGA_iCluster_k3"]] = anno_colors[["iCluster_k3_named"]]
mat_ordered = nmat_scaled[, order(ed_tcga_sample_bc[colnames(nmat_scaled), "ecto_endo_score"])]
tt = ed_tcga_sample_bc[colnames(mat_ordered), "EctoEndoClass"]
gaps_col = which(tt[1:length(tt)]!=tt[c(2:length(tt), length(tt))])
breaks_new = c(-2, seq(-1,1,2/98), 2)
pheatmap(mat_ordered, cluster_rows = F, cluster_cols = F, show_rownames = F, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", breaks=breaks_new, annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df3, show_colnames = F, fontsize = 40, legend = F, annotation_legend=T, gaps_col = gaps_col, filename=image_file, width=23, height=25)
#dev.off()
par(par_orig)
## Warning in par(par_orig): graphical parameter "cin" cannot be set
## Warning in par(par_orig): graphical parameter "cra" cannot be set
## Warning in par(par_orig): graphical parameter "csi" cannot be set
## Warning in par(par_orig): graphical parameter "cxy" cannot be set
## Warning in par(par_orig): graphical parameter "din" cannot be set
## Warning in par(par_orig): graphical parameter "page" cannot be set
spearman_r = cor(t(nmat_scaled), ed_tcga_sample_bc[colnames(nmat_scaled),"ecto_endo_score"], method="spearman", use="pairwise")
signature_genes_df = data.frame(spearman_r, stringsAsFactors = F)
signature_genes_df$rank = rank(signature_genes_df$spearman_r)
anno = entrez_to_symbol(rownames(signature_genes_df))
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "ENTREZID", : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the reshape2::dcast; please note
## that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype = "ENTREZID", columns =
## "SYMBOL")). In the next version, this warning will become an error.
signature_genes_df = merge(signature_genes_df, anno, by.x=0, by.y="EntrezID", all.x=T, sort=F)
rownames(signature_genes_df) = signature_genes_df$Row.names
signature_genes_df[["Row.names"]]=NULL
signature_genes_df = signature_genes_df[order(signature_genes_df$rank, decreasing=T),]
tmp = nmat_scaled[c(head(rownames(signature_genes_df),20), tail(rownames(signature_genes_df),20)),]
tmp_ordered = tmp[, order(ed_tcga_sample_bc[colnames(tmp), "ecto_endo_score"])]
pheatmap(tmp_ordered, cluster_rows = T, cluster_cols = T, show_rownames = T, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F, clustering_distance_cols = "correlation", clustering_method = "complete", labels_row = signature_genes_df[rownames(tmp_ordered), "GeneSymbol"] )
#
# tmp = nmat_scaled[rownames(subset(signature_genes_df, grepl("^MUC", GeneSymbol))),]
# tmp_ordered = tmp[, order(ed_tcga_sample_bc[colnames(tmp), "ecto_endo_score"])]
# pheatmap(tmp_ordered, cluster_rows = T, cluster_cols = F, show_rownames = T, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F, clustering_distance_cols = "correlation", clustering_method = "complete", labels_row = signature_genes_df[rownames(tmp_ordered), "GeneSymbol"] )
library(diptest)
all_dips = list()
par(mfrow=c(1,2))
for (i in 1:nrow(tmp_ordered)) {
entrez_id = rownames(tmp_ordered)[i]
expr_vec = log2(as.numeric(expr_mat_TCGA_norm[entrez_id,sel_samples]+1e-1))
dip_res = dip.test(expr_vec)$p.value
all_dips[[rownames(tmp_ordered)[i]]] = dip_res
plot(density(expr_vec, na.rm=T), main=signature_genes_df[entrez_id, "GeneSymbol"])
plot(ed_TCGA[colnames(expr_mat_TCGA_norm[,sel_samples]), "ecto_endo_score"], expr_vec, xlab="ecto_endo_score", ylab="expression")
}
#sel_genes_bimodal = c("KRT5","KRT6A", "CSTA", "TP63", "DSC3", "DSG3", "CLCA2", "CERS3", "MUC15","MUC21", "MUC5B", "PPP1R9A", "RGL3", "TOX3", "PROM1")
sel_genes_bimodal = c("KRT5","KRT6A", "CSTA", "TP63", "DSC3", "DSG3", "CLCA2", "CERS3", "MUC5B", "PPP1R9A", "RGL3", "TOX3", "PROM1")
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[, sel_samples]+1e-1)
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
nmat_scaled = t(scale(t(norm_mat_sig_genes), scale=F))
nmat_scaled = nmat_scaled[apply(is.na(nmat_scaled), 1, sum)==0, ]
nmat_scaled[abs(nmat_scaled) > 3] <- sign(nmat_scaled[abs(nmat_scaled) > 3]) * 3
s2e = symbol_to_entrez(sel_genes_bimodal)
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(s2e) = s2e$GeneSymbol
tmp = nmat_scaled[s2e[sel_genes_bimodal, ]$EntrezID,]
tmp_ordered = tmp[, order(ed_tcga_sample_bc[colnames(tmp), "ecto_endo_score"])]
pheatmap(tmp_ordered, cluster_rows = T, cluster_cols = F, show_rownames = T, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F, clustering_distance_cols = "correlation", clustering_method = "complete", labels_row = signature_genes_df[rownames(tmp_ordered), "GeneSymbol"] )
tmp = norm_mat_sig_genes[s2e[sel_genes_bimodal, ]$EntrezID,]
tmp_ordered = tmp[, order(ed_tcga_sample_bc[colnames(tmp), "ecto_endo_score"])]
pheatmap(tmp_ordered, cluster_rows = F, cluster_cols = F, show_rownames = T, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F, clustering_distance_cols = "correlation", clustering_method = "complete", labels_row = signature_genes_df[rownames(tmp_ordered), "GeneSymbol"] )
par_orig = par()
#image_file = paste(result_folder, "Supp Fig5 TCGA Ecto Endo Signature.tiff" ,sep="/")
#tiff(image_file, width=23*100, height=15 * 100, antialias=F)
image_file = paste(result_folder, "Fig 7b Bimodal genes.png" ,sep="/")
#png(image_file, width=2300, height=1500)
#par(mar=c(14,10,4,2))
col_anno_df3 = col_anno_df
col_anno_df3[["SAMP.Purity_Absolute"]] = NULL
col_anno_df3[["CLIN.Dx_merged"]] = NULL
colnames(col_anno_df3) = gsub("iCluster_k3_named", "TCGA_iCluster_k3", colnames(col_anno_df3))
anno_colors[["TCGA_iCluster_k3"]] = anno_colors[["iCluster_k3_named"]]
tmp = nmat_scaled[s2e[sel_genes_bimodal, ]$EntrezID,]
tmp_ordered = tmp[, order(ed_tcga_sample_bc[colnames(tmp), "ecto_endo_score"])]
pheatmap(tmp_ordered, cluster_rows = T, cluster_cols = F, show_rownames = T, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df3, show_colnames = F, clustering_distance_cols = "correlation", clustering_method = "complete", labels_row = signature_genes_df[rownames(tmp_ordered), "GeneSymbol"], fontsize = 20, filename = image_file, width=17, height=15)
#breaks_new = c(-2, seq(-1,1,2/98), 2)
#pheatmap(nmat_scaled[, order(ed_tcga_sample_bc[colnames(nmat_scaled), "ecto_endo_score"])], cluster_rows = F, cluster_cols = F, show_rownames = F, scale="none", main="Signif. genes in Ecto vs. Endo Whole organoids, TCGA Cervical cancer samples", breaks=breaks_new, annotation_row = row_anno_cat, annotation_colors = anno_colors, annotation_col = col_anno_df, show_colnames = F, fontsize = 40, legend = F, annotation_legend=T)
#dev.off()
par(par_orig)
## Warning in par(par_orig): graphical parameter "cin" cannot be set
## Warning in par(par_orig): graphical parameter "cra" cannot be set
## Warning in par(par_orig): graphical parameter "csi" cannot be set
## Warning in par(par_orig): graphical parameter "cxy" cannot be set
## Warning in par(par_orig): graphical parameter "din" cannot be set
## Warning in par(par_orig): graphical parameter "page" cannot be set
genes_ordered = c("CD63","KRT7","GDA","MMP7","AGR2","KRT5")
sel_genes = symbol_to_entrez(genes_ordered)
## Warning in dcast(select(org.Hs.eg.db, keys = tmp, keytype = "SYMBOL", columns = "ENTREZID"), : The dcast generic in data.table has been passed a data.frame and will attempt to redirect to the
## reshape2::dcast; please note that reshape2 is deprecated, and this redirection is now deprecated as well. Please do this redirection yourself like reshape2::dcast(select(org.Hs.eg.db, keys = tmp, keytype
## = "SYMBOL", columns = "ENTREZID")). In the next version, this warning will become an error.
rownames(sel_genes) = sel_genes$EntrezID
norm_mat_sig_genes = log2(expr_mat_TCGA_norm[rownames(expr_mat_TCGA_norm) %in% as.character(sel_genes$EntrezID), sel_samples]+1e-1)
#rownames(norm_mat_sig_genes) = sel_genes[as.character(rownames(norm_mat_sig_genes)),"EntrezID"]
rownames(norm_mat_sig_genes) = sel_genes[as.character(rownames(norm_mat_sig_genes)),"GeneSymbol"]
ee = ed_TCGA[colnames(norm_mat_sig_genes),]
colnames(norm_mat_sig_genes) = paste(ee$sample_barcode,sep="_")
norm_mat_sig_genes_sorted2 = norm_mat_sig_genes[,order(colnames(norm_mat_sig_genes))]
# Median center expression values
nmat_scaled = t(scale(t(norm_mat_sig_genes_sorted2[genes_ordered,]),scale=F, center=F))
norm_mat_ts = melt(nmat_scaled)
## Warning in melt(nmat_scaled): The melt generic in data.table has been passed a matrix and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(nmat_scaled). In
## the next version, this warning will become an error.
colnames(norm_mat_ts) = c("Gene","SampleID","Expression_Z_score")
norm_mat_ts$HistoClass = factor(ed_sorted[as.character(norm_mat_ts$SampleID),"histology_class"], levels=c("Cervical Adenocarcinoma", "Cervical Squamous Cell Carcinoma"))
myScale = scale_fill_manual(name = "Histology", values = c("Cervical Squamous Cell Carcinoma" ="red","Cervical Adenocarcinoma"="blue"), labels = c("Cervical Squamous Cell Carcinoma"="Cervical Squamous Cell Carcinoma","Cervical Adenocarcinoma"="Cervical Adenocarcinoma"))
ggplot(data=norm_mat_ts ) + geom_boxplot(aes(x=HistoClass, y = Expression_Z_score, fill=HistoClass)) + facet_grid(. ~ Gene ) + myScale + theme(text = element_text(size=12), axis.text.x = element_text(angle=45, vjust=1, hjust = 1),strip.text.x = element_text(size = 12, colour = "black", angle = 0)) + xlab("Histology") + ylab("Mean adjusted expression level")
SCJ_global_p_values = list()
for (g in levels(factor(norm_mat_ts$Gene))) {
tmp = subset(norm_mat_ts, as.character(Gene)==g)
res = wilcox.test(Expression_Z_score ~ HistoClass, tmp)
SCJ_global_p_values[[g]] = res$p.value
}
res_df = t(as.data.frame(SCJ_global_p_values))
colnames(res_df) = c("Mann-Whitney p-Value")
kable(res_df, format = "markdown")
| Mann-Whitney p-Value | |
|---|---|
| CD63 | 4.10e-06 |
| KRT7 | 9.54e-05 |
| GDA | 2.60e-06 |
| MMP7 | 6.19e-05 |
| AGR2 | 0.00e+00 |
| KRT5 | 0.00e+00 |
par_orig = par()
image_file = paste(result_folder, "Fig. 7d TCGA_SCJ_Markers Boxplot.tiff" ,sep="/")
tiff(image_file, width=12*100, height=8 * 100)
#par(mar=c(14,14,4,2))
norm_mat_ts$HistoClassShort = ifelse(norm_mat_ts$HistoClass == "Cervical Squamous Cell Carcinoma", "Squamous carcionma", "Adenocarcinoma")
myScale = scale_fill_manual(name = "Histology", values = c("Cervical Squamous Cell Carcinoma"="#f8766d","Cervical Adenocarcinoma"="#00bfc4"), labels = c("Cervical Squamous Cell Carcinoma"="Cervical Squamous Cell Carcinoma","Cervical Adenocarcinoma"="Cervical Adenocarcinoma"))
ggplot(data=norm_mat_ts ) + geom_boxplot(aes(x=HistoClassShort, y = Expression_Z_score, fill=HistoClass)) + facet_grid(. ~ Gene ) + myScale + theme(panel.border = element_rect( colour = "black", fill=NA)) + theme(text = element_text(size=26), axis.text.x = element_text(size=30,angle=45, vjust=1, hjust = 1), axis.text.y = element_text(size=26), strip.text.x = element_text(size = 30, colour = "black", angle = 0), panel.grid.major = element_line(colour = "white"), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank()) + xlab("Tissue Type") + ylab("Expression level") + theme(plot.margin=unit(c(1,1,1.5,8),"cm"))
#panel.background = element_rect(fill = "lightgrey"),
dev.off()
par(par_orig)
## Warning in par(par_orig): graphical parameter "cin" cannot be set
## Warning in par(par_orig): graphical parameter "cra" cannot be set
## Warning in par(par_orig): graphical parameter "csi" cannot be set
## Warning in par(par_orig): graphical parameter "cxy" cannot be set
## Warning in par(par_orig): graphical parameter "din" cannot be set
## Warning in par(par_orig): graphical parameter "page" cannot be set
table(ed_TCGA$SAMP.mRNAseq_k3, ed_TCGA$EctoEndoClass, dnn=c("TCGA RNA clusters, k=3", "Sq/Co cluster"))
## Sq/Co cluster
## TCGA RNA clusters, k=3 Ecto Endo undetermined
## C1 0 46 1
## C2 67 0 29
## C3 1 2 32
table(ed_TCGA$SAMP.miRNA_k6, ed_TCGA$EctoEndoClass, dnn=c("TCGA miRNA clusters, k=6","Sq/Co cluster"))
## Sq/Co cluster
## TCGA miRNA clusters, k=6 Ecto Endo undetermined
## C1 17 4 15
## C2 20 0 3
## C3 20 9 30
## C4 10 0 9
## C5 0 27 3
## C6 1 8 2
table(ed_TCGA$SAMP.iCluster_All_k3, ed_TCGA$EctoEndoClass, dnn=c("TCGA iCluster clusters, k=3","Sq/Co cluster"))
## Sq/Co cluster
## TCGA iCluster clusters, k=3 Ecto Endo undetermined
## C1 11 7 32
## C2 57 0 29
## C3 0 41 1
table(ed_TCGA$SAMP.iCluster_Squam_k2, ed_TCGA$EctoEndoClass, dnn=c("TCGA iCluster Squam clusters, k=2","Sq/Co cluster"))
## Sq/Co cluster
## TCGA iCluster Squam clusters, k=2 Ecto Endo undetermined
## C1 66 0 31
## C2 2 16 29
table(ed_TCGA$SAMP.iCluster_Adeno_k2, ed_TCGA$EctoEndoClass, dnn=c("TCGA iCluster Adeno clusters, k=2","Sq/Co cluster"))
## Sq/Co cluster
## TCGA iCluster Adeno clusters, k=2 Ecto Endo undetermined
## C1 0 18 0
## C2 0 12 1
ftable(ed_TCGA$histology_class, ed_TCGA$SAMP.iCluster_All_k3, ed_TCGA$EctoEndoClass, dnn=c("Histology","TCGA iCluster clusters, k=3","Sq/Co cluster"))
## Sq/Co cluster Ecto Endo undetermined
## Histology TCGA iCluster clusters, k=3
## Cervical Adenocarcinoma C1 0 1 2
## C2 0 0 0
## C3 0 31 0
## Cervical Squamous Cell Carcinoma C1 11 6 30
## C2 57 0 29
## C3 0 10 1
ftable(ed_TCGA$histology_class, ed_TCGA$iCluster_k3_named, ed_TCGA$EctoEndoClass, dnn=c("Histology","TCGA iCluster clusters, k=3","Sq/Co cluster"))
## Sq/Co cluster Ecto Endo undetermined
## Histology TCGA iCluster clusters, k=3
## Cervical Adenocarcinoma Adenocarcinoma 0 31 0
## Keratin-high 0 0 0
## Keratin-low 0 1 2
## Cervical Squamous Cell Carcinoma Adenocarcinoma 0 10 1
## Keratin-high 57 0 29
## Keratin-low 11 6 30
pdf(file=file.path(result_folder, "Ext Fig_8c_Cluster_Cell_Purity.pdf"), width=6, height=6)
ggplot(ed_TCGA, aes(col=EctoEndoClass, x=SAMP.Purity_Absolute)) + geom_density(size=1.5)
boxplot(ed_TCGA$SAMP.Purity_Absolute ~ ed_TCGA$EctoEndoClass, xlab="Sample Purity Estimate by ABSOLUTE")
dev.off()
## png
## 2
with(subset(ed_TCGA, SAMP.Purity_Absolute > 0.5), ftable(histology_class, iCluster_k3_named, EctoEndoClass, dnn=c("Histology","TCGA iCluster clusters, k=3","Sq/Co cluster")))
## Sq/Co cluster Ecto Endo undetermined
## Histology TCGA iCluster clusters, k=3
## Cervical Adenocarcinoma Adenocarcinoma 0 29 0
## Keratin-high 0 0 0
## Keratin-low 0 0 2
## Cervical Squamous Cell Carcinoma Adenocarcinoma 0 8 1
## Keratin-high 44 0 15
## Keratin-low 11 4 20
with(subset(ed_TCGA, SAMP.Purity_Absolute < 0.5), ftable(histology_class, iCluster_k3_named, EctoEndoClass, dnn=c("Histology","TCGA iCluster clusters, k=3","Sq/Co cluster")))
## Sq/Co cluster Ecto Endo undetermined
## Histology TCGA iCluster clusters, k=3
## Cervical Adenocarcinoma Adenocarcinoma 0 2 0
## Keratin-high 0 0 0
## Keratin-low 0 1 0
## Cervical Squamous Cell Carcinoma Adenocarcinoma 0 2 0
## Keratin-high 10 0 13
## Keratin-low 0 1 9
with(ed_TCGA, ftable(histology_class, factor(SAMP.Purity_Absolute < 0.5, levels=c(T,F), labels=c("Low", "High")), EctoEndoClass, dnn=c("Histology","Sample purity","Sq/Co cluster")))
## Sq/Co cluster Ecto Endo undetermined
## Histology Sample purity
## Cervical Adenocarcinoma Low 0 3 0
## High 0 29 2
## Cervical Squamous Cell Carcinoma Low 10 3 22
## High 58 13 38
write.table(ed_TCGA, file=paste(result_folder, "ExpDesign_TCGA_with_scores.txt", sep="/"), sep="\t", row.names=F, quote=F)
#sel_cols = c("sample_barcode","histology_class","keratinization_squamous_cell","tumor_grade","histology_class","KRT5","KRT7","KRT5_class","KRT7_class","KRT_class","TP63","TP63_class","ecto_endo_score","EctoEndoClass")
sel_cols = c("sample_barcode","iCluster_k3_named","CLIN.Dx_merged","histology_class","CLIN.HPV_status","CLIN.HPV_int1","SAMP.Purity_Absolute","KRT5_class","KRT7_class","TP63_class","ecto_endo_score","EctoEndoClass")
write.table(ed_TCGA[, sel_cols], file=paste(result_folder, "ExpDesign_TCGA_with_scores_short.txt", sep="/"), sep="\t", row.names=F, quote=F)
write.table(sel_genes_ecto_endo_in_TCGA, file=paste(result_folder, "EctoEndoSignature_TCGA_Fig5A.txt", sep="/"), sep="\t", row.names=F, quote=F)
library(magrittr)
library(qwraps2)
options(qwraps2_markup = "markdown")
ed_TCGA_grouped_ecto_endo <-
dplyr::mutate(ed_TCGA,
group_factor = factor(EctoEndoClass,
levels = c("Endo", "Ecto","undetermined"),
labels = c("Columnar-like","Squamous-like","undetermined"))
)
summary2 = ed_TCGA_grouped_ecto_endo %>%
dplyr::select(.data$iCluster_k3_named, .data$CLIN.Dx_merged, .data$CLIN.HPV_status, .data$CLIN.HPV_int1, .data$SAMP.Purity_Absolute, .data$KRT5_class, .data$KRT7_class, .data$TP63_class) %>% qsummary(.)
names(summary2) = c("TCGA iCluster (k=3)", "Histopathological diagnosis","HPV Status","HPV integration","Sample Purity Estimate", "KRT5","KRT7","TP63")
summary_table(dplyr::group_by(ed_TCGA_grouped_ecto_endo, group_factor), summary2)
##
##
## | |group_factor: Columnar-like (N = 48) |group_factor: Squamous-like (N = 68) |group_factor: undetermined (N = 62) |
## |:-------------------------------|:------------------------------------|:------------------------------------|:-----------------------------------|
## |**TCGA iCluster (k=3)** | | | |
## | Adenocarcinoma |41 (85) |0 (0) |1 (2) |
## | Keratin-high |0 (0) |57 (84) |29 (47) |
## | Keratin-low |7 (15) |11 (16) |32 (52) |
## |**Histopathological diagnosis** | | | |
## | Adenocarcinoma |30 (62) |0 (0) |1 (2) |
## | Adenosquamous |2 (4) |0 (0) |1 (2) |
## | Squamous |16 (33) |68 (100) |60 (97) |
## |**HPV Status** | | | |
## | negative |7 (15) |2 (3) |0 (0) |
## | positive |41 (85) |66 (97) |62 (100) |
## |**HPV integration** | | | |
## | No |13 (27) |17 (25) |7 (11) |
## | Yes |35 (73) |51 (75) |55 (89) |
## |**Sample Purity Estimate** | | | |
## | minimum |0.38 |0.25 |0.22 |
## | median (IQR) |0.73 (0.64, 0.81) |0.69 (0.56, 0.79) |0.57 (0.43, 0.74) |
## | mean (sd) |0.71 ± 0.14 |0.67 ± 0.16 |0.59 ± 0.19 |
## | maximum |0.94 |0.96 |0.95 |
## |**KRT5** | | | |
## | high |4 (8) |68 (100) |54 (87) |
## | low |44 (92) |0 (0) |8 (13) |
## |**KRT7** | | | |
## | high |46 (96) |38 (56) |45 (73) |
## | low |2 (4) |30 (44) |17 (27) |
## |**TP63** | | | |
## | high |9 (19) |68 (100) |60 (97) |
## | low |39 (81) |0 (0) |2 (3) |
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.7 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] qwraps2_0.4.2 magrittr_1.5 diptest_0.75-7 BiocParallel_1.12.0 DESeq2_1.18.1 SummarizedExperiment_1.8.1 DelayedArray_0.4.1
## [8] matrixStats_0.56.0 org.Hs.eg.db_3.5.0 AnnotationDbi_1.40.0 Biobase_2.38.0 rms_5.1-2 SparseM_1.78 Hmisc_4.4-0
## [15] Formula_1.2-3 survival_3.2-3 lattice_0.20-41 rrcov_1.5-2 robustbase_0.93-6 pander_0.6.3 GenomicRanges_1.30.3
## [22] GenomeInfoDb_1.14.0 IRanges_2.12.0 S4Vectors_0.16.0 BiocGenerics_0.24.0 data.table_1.12.8 ggplot2_3.3.2 naturalsort_0.1.3
## [29] scatterplot3d_0.3-41 RColorBrewer_1.1-2 impute_1.52.0 gplots_3.0.4 xtable_1.8-4 GSA_1.03.1 pheatmap_1.0.12
## [36] reshape2_1.4.4 xlsx_0.6.3 limma_3.34.9 rmarkdown_2.3 knitr_1.29
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 ellipsis_0.3.1 htmlTable_2.0.1 XVector_0.18.0 base64enc_0.1-3 rstudioapi_0.11 farver_2.0.3
## [9] MatrixModels_0.4-1 bit64_0.9-7 mvtnorm_1.0-7 codetools_0.2-16 splines_3.4.3 geneplotter_1.56.0 annotate_1.56.2 rJava_0.9-9
## [17] cluster_2.1.0 compiler_3.4.3 backports_1.1.8 Matrix_1.2-18 acepack_1.4.1 htmltools_0.5.0 quantreg_5.55 tools_3.4.3
## [25] gtable_0.3.0 glue_1.4.1 GenomeInfoDbData_1.0.0 dplyr_1.0.0 Rcpp_1.0.4.6 vctrs_0.3.1 gdata_2.18.0 nlme_3.1-148
## [33] xfun_0.15 stringr_1.4.0 xlsxjars_0.6.1 lifecycle_0.2.0 gtools_3.8.2 XML_3.98-1.10 polspline_1.1.19 DEoptimR_1.0-8
## [41] zlibbioc_1.24.0 MASS_7.3-51.6 zoo_1.8-8 scales_1.1.1 sandwich_2.5-1 yaml_2.2.1 memoise_1.1.0 gridExtra_2.3
## [49] rpart_4.1-15 latticeExtra_0.6-28 stringi_1.4.6 RSQLite_2.2.0 highr_0.8 genefilter_1.60.0 pcaPP_1.9-73 checkmate_2.0.0
## [57] caTools_1.17.1.3 rlang_0.4.6 pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14 purrr_0.3.4 labeling_0.3 htmlwidgets_1.5.1
## [65] bit_1.1-15.2 tidyselect_1.1.0 plyr_1.8.6 R6_2.4.1 generics_0.0.2 multcomp_1.4-8 DBI_1.1.0 pillar_1.4.4
## [73] foreign_0.8-74 withr_2.2.0 RCurl_1.98-1.2 nnet_7.3-14 tibble_3.0.1 crayon_1.3.4 KernSmooth_2.23-17 locfit_1.5-9.1
## [81] grid_3.4.3 blob_1.2.1 digest_0.6.25 munsell_0.5.0